SE object created from exploratory analysis.
Gene selection according to Biotype already performed
SE_Bio <- readRDS(paste0(params$SE_Bio))Duplicated gene names are dropped and gene IDs are set as rownames.
The number of duplicated GeneName is: 11899
The number of duplicated Ensembl Genes with version is: 0
SE_Bio <- SE_Bio[!duplicated(rowData(SE_Bio)$GeneName), ]
rownames(SE_Bio) <- rowData(SE_Bio)$GeneName
SE_Bio
## class: SummarizedExperiment
## dim: 24708 36
## metadata(0):
## assays(1): counts
## rownames(24708): TSPAN6 TNMD ... LNCDAT CDR1
## rowData names(9): Gene EnsGene ... Start End
## colnames(36): OLE_001 OLE_002 ... OLE_048 OLE_049
## colData names(11): SampleNumber SampleID_GU ... HRID lib.sizeScaledCols <- c('darkblue', "purple","white","lightgoldenrod1", 'goldenrod1')
padj_threshold = params$padj_threshold
log2fc_threshold = params$log2fc_thresholdOnly iPSCs are kept
SE_Bio_d25CbO <- SE_Bio[, colData(SE_Bio)$Timepoint %in% 'd0']
colData(SE_Bio_d25CbO)
## DataFrame with 9 rows and 11 columns
## SampleNumber SampleID_GU SampleID_OG SampleName SampleType Genotype
## <integer> <character> <character> <character> <character> <character>
## OLE_001 1 IPS_X_WT IPS_X_WT iPSCs_CHD2_WT_p63_10.. iPSCs WT
## OLE_002 2 IPS_X_HT IPS_X_HT iPSCs_CHD2_HT_40_111.. iPSCs HT
## OLE_004 3 IPS_K_WT IPS_K_WT iPSCs_CHD2_WT_p64_04.. iPSCs WT
## OLE_005 4 IPS_K_HT IPS_K_HT iPSCs_CHD2_HT_p38_04.. iPSCs HT
## OLE_026 13 OLE_026 OL01 1_iPSC_WTC iPSCs WT
## OLE_027 14 OLE_027 OL02 2_iPSC_WT3 iPSCs WT
## OLE_028 15 OLE_028 OL03 3_iPSC_A11-5 iPSCs AR
## OLE_029 16 OLE_029 OL04 4_iPSC_A5 iPSCs AR
## OLE_030 17 OLE_030 OL05 5_iPSC_HTpull iPSCs HT
## Timepoint Batch SeqRun HRID lib.size
## <character> <character> <integer> <character> <numeric>
## OLE_001 d0 iPSCRound 1 OLE_001 18760237
## OLE_002 d0 iPSCRound 1 OLE_002 19271736
## OLE_004 d0 iPSCRound 1 OLE_004 16535067
## OLE_005 d0 iPSCRound 1 OLE_005 21791289
## OLE_026 d0 Evo1 2 OLE_026 23062896
## OLE_027 d0 Evo1 2 OLE_027 21313277
## OLE_028 d0 Evo1 2 OLE_028 23796763
## OLE_029 d0 Evo1 2 OLE_029 22872619
## OLE_030 d0 Evo1 2 OLE_030 21414146CountMatrix <- assays(SE_Bio_d25CbO)$counts
SampleMeta <- DataFrame(colData(SE_Bio_d25CbO))
all(rownames(SampleMeta) == colnames(CountMatrix))
## [1] TRUEdds <- DESeqDataSetFromMatrix(countData=assays(SE_Bio_d25CbO)$counts, DataFrame(colData(SE_Bio_d25CbO)), design = ~Batch+Genotype)
## converting counts to integer mode
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in design formula are
## characters, converting to factors
mcols(dds) <- DataFrame(mcols(dds), DataFrame(rowData(SE_Bio_d25CbO)))
dds$Genotype <- factor(dds$Genotype, levels = c("WT", "AR", 'HT')) #no need to specify as column is already ordered, but safer
dds$Genotype <- relevel(dds$Genotype, ref = "WT")
dds$Genotype
## [1] WT HT WT HT WT WT AR AR HT
## Levels: WT AR HT
dds
## class: DESeqDataSet
## dim: 24708 9
## metadata(1): version
## assays(1): counts
## rownames(24708): TSPAN6 TNMD ... LNCDAT CDR1
## rowData names(9): Gene EnsGene ... Start End
## colnames(9): OLE_001 OLE_002 ... OLE_029 OLE_030
## colData names(11): SampleNumber SampleID_GU ... HRID lib.sizeZeroPlot <- CountMatrix %>%
mutate(row = row_number()) %>%
pivot_longer(cols = -row, names_to = "col", values_to = "value") %>%
filter(value == 0) %>%
group_by(col) %>%
summarise(zerocount = n()) %>%
ggplot(., aes(y=col, x=zerocount)) +
geom_col(col='black', fill='#76c8c8') +
coord_flip() +
geom_label(aes(label=zerocount)) +
labs(title=paste0('Number of genes with zero counts ', '(out of ', nrow(CountMatrix), ' genes)'),
y='', x='') +
theme_bw() +
theme(axis.text = element_text(colour = 'black', size=7),
axis.text.x = element_text(angle=45, hjust = 0.5, vjust=0.5),
plot.title = element_text(hjust = 0.5, size = 7))
ZeroPlotkeep <- rowSums(counts(dds)>=5) >= 2 #changed from 5 ##changed from 10 and 2 samples from 3
table(keep)
## keep
## FALSE TRUE
## 7727 16981
dds <- dds[keep,]
dds
## class: DESeqDataSet
## dim: 16981 9
## metadata(1): version
## assays(1): counts
## rownames(16981): TSPAN6 TNMD ... C8orf44 PDCD6-AHRR
## rowData names(9): Gene EnsGene ... Start End
## colnames(9): OLE_001 OLE_002 ... OLE_029 OLE_030
## colData names(11): SampleNumber SampleID_GU ... HRID lib.sizeA dds object containing information about 16981 genes in 9 samples is tested for differential expression.
dds <- DESeq(dds)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
sizeFactors(dds)
## OLE_001 OLE_002 OLE_004 OLE_005 OLE_026 OLE_027 OLE_028 OLE_029 OLE_030
## 0.8770382 0.8787711 0.7744089 0.9846501 1.1871144 1.0970942 1.1832126 1.1512840 1.0245872select <- order(rowMeans(counts(dds,normalized=TRUE)),
decreasing=TRUE)[1:50]
ntd <- normTransform(dds)
df <- as.data.frame(colData(dds)[,c("Genotype")])
colnames(df) <- 'Genotype'
#df$Run <- c(rep('rep1', 3), rep('rep2', 3))
rownames(df) <- colnames(ntd)
pheatmap(assay(ntd)[select,], cluster_rows=FALSE, show_rownames=FALSE,
cluster_cols=TRUE, annotation_col=df, border_color = 'black')vsd <- vst(dds, blind=FALSE)
sampleDists <- dist(t(assay(vsd)))
sampleDistMatrix <- as.matrix(sampleDists)
#rownames(sampleDistMatrix) <- paste(vsd$condition, vsd$type, sep="-")
#colnames(sampleDistMatrix) <- NULL
colors <- colorRampPalette(rev(RColorBrewer::brewer.pal(9, "Blues")) )(255)
pheatmap(sampleDistMatrix,
clustering_distance_rows=sampleDists,
clustering_distance_cols=sampleDists,
col=colors, main = 'Samples Distances', name = 'vst')SF <- data.frame(Sample=names(sizeFactors(dds)), SizeF=sizeFactors(dds))
par(mfrow=c(2,2),cex.lab=0.7)
boxplot(log2(counts(dds)), col=ScaledCols, cex.axis=0.7,
las=1, xlab="log2(counts)", horizontal=TRUE, main="Raw counts")
boxplot(log2(counts(dds, normalized=TRUE)), col=ScaledCols, cex.axis=0.7,
las=1, xlab="log2(normalized counts)", horizontal=TRUE, main="Normalized counts")
plotDensity(log2(counts(dds)), col=ScaledCols,
xlab="log2(counts)", cex.lab=0.7, panel.first=grid())
plotDensity(log2(counts(dds, normalized=TRUE)), col=ScaledCols,
xlab="log2(normalized counts)", cex.lab=0.7, panel.first=grid()) plotDispEsts(dds)res_dds_ht <- results(dds, contrast=c("Genotype","HT","WT"), alpha=0.05, cooksCutoff=0.99)
summary(res_dds_ht)
##
## out of 16981 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up) : 1976, 12%
## LFC < 0 (down) : 1673, 9.9%
## outliers [1] : 0, 0%
## low counts [2] : 0, 0%
## (mean count < 1)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
metadata(res_dds_ht)$filterThreshold
## 0%
## 1.060529
metadata(res_dds_ht)$alpha
## [1] 0.05
plot(metadata(res_dds_ht)$filterNumRej,
type="b", ylab="number of rejections",
xlab="quantiles of filter", main='Heterozygous',
cex.lab=0.5, cex.axis=0.5, cex.main=0.5)
lines(metadata(res_dds_ht)$lo.fit, col="red")
abline(v=metadata(res_dds_ht)$filterTheta)head(res_dds_ht[order(res_dds_ht$padj),])
## log2 fold change (MLE): Genotype HT vs WT
## Wald test p-value: Genotype HT vs WT
## DataFrame with 6 rows and 6 columns
## baseMean log2FoldChange lfcSE stat pvalue padj
## <numeric> <numeric> <numeric> <numeric> <numeric> <numeric>
## PDGFA 1371.073 3.08521 0.132934 23.2086 3.72933e-119 6.33277e-115
## UTF1 620.684 5.44024 0.247564 21.9751 4.98608e-107 4.23343e-103
## ECEL1 369.708 3.78976 0.182203 20.7997 4.35644e-96 2.46589e-92
## ZNF562 396.203 -3.43119 0.179106 -19.1574 8.40246e-82 3.56705e-78
## VENTX 411.739 3.83143 0.203351 18.8414 3.45413e-79 1.17309e-75
## MLLT6 1254.439 2.08808 0.119621 17.4557 3.11301e-68 8.81035e-65Genes modulated considering a FDR threshold of 0.1: 4550
Genes modulated considering a FDR threshold of 0.05: 3649
Genes modulated considering a FDR threshold of 0.05 and FC threshold of 1.5: 2574
Genes modulated considering a FDR threshold of 0.05 and FC threshold of 2: 1639
Since I am using the constrast option to retrieve the results, I cannot rely on apleglm default algorithm for logFC shrinkage. Since at the moment I am not using the lfcThreshold option, I decide for the ashr algorithm.
resAshr_ht <- lfcShrink(dds, contrast=c("Genotype","HT","WT"), res=res_dds_ht, type="ashr")
## using 'ashr' for LFC shrinkage. If used in published research, please cite:
## Stephens, M. (2016) False discovery rates: a new deal. Biostatistics, 18:2.
## https://doi.org/10.1093/biostatistics/kxw041
summary(resAshr_ht)
##
## out of 16981 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up) : 1976, 12%
## LFC < 0 (down) : 1673, 9.9%
## outliers [1] : 0, 0%
## low counts [2] : 0, 0%
## (mean count < 1)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results#par(mfrow=c(1,2), mar=c(4,4,2,1))
xlim <- c(1,1e5); ylim <- c(-6,6)
DESeq2::plotMA(res_dds_ht, xlim=xlim, ylim=ylim, main="no LFC shrink")DESeq2::plotMA(resAshr_ht, xlim=xlim, ylim=ylim, main="LFC shrink with ashr algorithm")head(resAshr_ht[order(resAshr_ht$padj),])
## log2 fold change (MMSE): Genotype HT vs WT
## Wald test p-value: Genotype HT vs WT
## DataFrame with 6 rows and 5 columns
## baseMean log2FoldChange lfcSE pvalue padj
## <numeric> <numeric> <numeric> <numeric> <numeric>
## PDGFA 1371.073 3.06714 0.133478 3.72933e-119 6.33277e-115
## UTF1 620.684 5.41367 0.247868 4.98608e-107 4.23343e-103
## ECEL1 369.708 3.76642 0.183406 4.35644e-96 2.46589e-92
## ZNF562 396.203 -3.40353 0.180476 8.40246e-82 3.56705e-78
## VENTX 411.739 3.80289 0.204973 3.45413e-79 1.17309e-75
## MLLT6 1254.439 2.06906 0.119762 3.11301e-68 8.81035e-65Genes modulated considering a FDR threshold of 0.1: 4550
Genes modulated considering a FDR threshold of 0.05: 3649
Genes modulated considering a FDR threshold of 0.05 and FC threshold of 1.5: 2164
Genes modulated considering a FDR threshold of 0.05 and FC threshold of 2: 1340
Log Fold change shrinkage doesn’t change the results significantly, thus I decided to keep the standard DESeq2 workflow for testing differential gene expression.
deseqDEGsHT <- dplyr::filter(data.frame(res_dds_ht), padj < padj_threshold, abs(log2FoldChange) > log2(log2fc_threshold))
deseqDEGsHTashr <- dplyr::filter(data.frame(resAshr_ht), padj < padj_threshold, abs(log2FoldChange) > log2(log2fc_threshold))SE_deseq2 <- as(dds, "RangedSummarizedExperiment")
assays(SE_deseq2)$vst <- assay(vst(dds, blind=TRUE))FdrCeil = 1e-10
logFcCeil = 7.5#rename(as.data.frame(res_dds_ht), logFC = log2FoldChange, FDR = padj) %>% VolcanoTiltedNodash(. , AdjustedCutoff = 0.05, LabellingCutoff = 0.01, FCCutoff = 2, main = 'HT vs WT')
dplyr::rename(as.data.frame(res_dds_ht), logFC = log2FoldChange, FDR = padj) %>% VolcanoTiltedNodash(. , AdjustedCutoff = padj_threshold, LabellingCutoff = 0.01, FCCutoff = log2fc_threshold, main = 'HT vs WT (iPSCs CO)') + labs(y='Log FoldChange') + xlim(0, -log10(FdrCeil)) + ylim(-logFcCeil, logFcCeil)#rename(as.data.frame(res_dds_ht), logFC = log2FoldChange, FDR = padj) %>% VolcanoTiltedNodash(. , AdjustedCutoff = 0.05, LabellingCutoff = 0.01, FCCutoff = 2, main = 'HT vs WT')
dplyr::rename(as.data.frame(resAshr_ht), logFC = log2FoldChange, FDR = padj) %>% VolcanoTiltedNodash(. , AdjustedCutoff = padj_threshold, LabellingCutoff = 0.01, FCCutoff = log2fc_threshold, main = 'HT vs WT (iPSCs CO)') + labs(y='Log FoldChange') + xlim(0, -log10(FdrCeil)) + ylim(-logFcCeil, logFcCeil)DEAList <- list()
DEAList <- list(dds = dds, #same for both
HT = list(res = res_dds_ht,
DEGs = deseqDEGsHT,
resAshr =resAshr_ht,
DEGsAshr = deseqDEGsHTashr))# RDS
saveRDS(DEAList, paste0(SavingFolder, '/ipsc.', 'DEAList_HT.rds'))
saveRDS(SE_deseq2, paste0(SavingFolder, '/ipsc.', 'SE_deseq2_HT.rds'))
saveRDS(res_dds_ht, paste0(SavingFolder, '/ipsc.', 'deseqHTvsWT.rds'))
SessionInfo <- sessionInfo()
Date <- date()
save.image(paste0(SavingFolder, '/ipsc.', 'DEAnalysisWorkspace_HT.RData'))